{
  "cells": [
    {
      "metadata": {
        "_uuid": "447978a2af25d44f0f9a69c3217efde89b40a22c",
        "_execution_state": "idle",
        "trusted": true
      },
      "cell_type": "code",
      "source": "%%time\nimport datetime\nimport operator\nimport os\nimport sys\nimport time\nimport warnings\n\nimport lightgbm as lgb\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nimport xgboost as xgb\nfrom catboost import CatBoostRegressor\nfrom imblearn.over_sampling import SMOTE\nfrom imblearn.under_sampling import RandomUnderSampler\nfrom scipy.stats import ks_2samp\nfrom sklearn import manifold\nfrom sklearn import metrics\nfrom sklearn.linear_model import RidgeCV\nfrom sklearn.metrics import mean_absolute_error\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold\nfrom sklearn.preprocessing import StandardScaler\nfrom tqdm import tqdm\n\nsns.set_style('whitegrid')\nwarnings.filterwarnings(\"ignore\")\n\nprint(os.listdir(\"../input\"))\ntrain = pd.read_csv('../input/train.csv')\ntest = pd.read_csv('../input/test.csv')",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "844c8bd7e0be4afa347bbb8a9fafc64daad7645e"
      },
      "cell_type": "code",
      "source": "#EDA\nprint('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')\n#数据大小\nprint('Rows: ',train.shape[0],'Columns: ',train.shape[1])\nprint(train.head())\nprint('Rows: ',test.shape[0],'Columns: ',test.shape[1])\nprint(test.head())",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "4a3a4eeeb18edb6da2d98d4320e15d1609592223"
      },
      "cell_type": "code",
      "source": "print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')\n#label的分布\nprint(train['target'].value_counts())\nsns.countplot(train['target'])\nsns.set_style('whitegrid')",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "96b9b5c75f46ef007fdf1295d3955eaaa2aaf0e7"
      },
      "cell_type": "code",
      "source": "print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')\n#每个特征的缺省值\ntotal = train.isnull().sum().sort_values(ascending = False)\npercent = (train.isnull().sum()/train.isnull().count()*100).sort_values(ascending = False)\nmissing_train_data  = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])\nprint(missing_train_data)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "512ebe1e26c27aadf8e889abf6de6f9e401f0973"
      },
      "cell_type": "code",
      "source": "print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')\n#每个特征下的数据取值不同个数\nfor col in train.columns[2:]:\n    print(\"Number of unique values of {} : {}\".format(col, train[col].nunique()))\n#看一下var_68这个特征\nprint('-------------------------------')\nprint(train['var_68'].value_counts()) \nprint('-------------------------------')\nprint(test['var_68'].value_counts())   ",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "8e3dc881a74f91e7a219489b982aa1cac3ac3ef4"
      },
      "cell_type": "code",
      "source": "print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')\n#每一个特征和label之间的相关系数\ncorr = train.corr()\nprint(abs(corr['target']).sort_values(ascending=False))",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "09c1f6626abcef2135e937885a13cc0465483e94"
      },
      "cell_type": "code",
      "source": "print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')\n#每一个特征和label之间的相关可视化\ntarget_mask = train['target'] == 1\nnon_target_mask = train['target'] == 0 \nstatistics_array = []\nfor col in train.columns[2:]:\n    statistic, pvalue = ks_2samp(train.loc[non_target_mask, col], train.loc[target_mask, col])\n    statistics_array.append(statistic)\n    fig, ax = plt.subplots(1, 1, figsize=(10, 4))\n    sns.kdeplot(train.loc[non_target_mask, col], ax=ax, label='Target == 0')\n    sns.kdeplot(train.loc[target_mask, col], ax=ax, label='Target == 1')\n\n    ax.set_title('name: {}, statistics: {:.5f}, pvalue: {:5f}'.format(col, statistic, pvalue))\n    plt.show()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "2b587ff04bf045445f4808f4665256d1e1a08676"
      },
      "cell_type": "code",
      "source": "test_x = test.drop(['ID_code','var_185','var_27','var_30','var_17','var_38','var_41','var_126','var_103'],axis=1)\ntrain_x = train.drop(['ID_code','target','var_185','var_27','var_30','var_17','var_38','var_41','var_126','var_103'],axis=1)\ntrain_y = train['target']",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "39b400e1ea12261dba6cd3a7c88794de9648ce5b"
      },
      "cell_type": "code",
      "source": "# #归一化\n# scaler = StandardScaler()\n# train_scaler_x = scaler.fit_transform(train_x)\n# test_scaler_x = scaler.transform(test_x)\n# train_x = pd.DataFrame(train_scaler_x,columns=train_x.columns.tolist())\n# test_x = pd.DataFrame(test_scaler_x,columns=train_x.columns.tolist())",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "262c8adacfd70a2c7136226783fd1b1e0c31df43"
      },
      "cell_type": "code",
      "source": "n_fold = 5\nfolds = KFold(n_splits=n_fold, shuffle=True)\n# #欠采样\n# ros=RandomUnderSampler(random_state=42)\n# x_resampled,y_resampled=ros.fit_sample(train_x.values,train_y.values)\n# x_resampled = pd.DataFrame(x_resampled,columns=train_x.columns.tolist())\n# # #过采样\n# # x_resampled, y_resampled = SMOTE(kind='borderline1').fit_sample(train_x.values, train_y.values)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "6a834aadef17130aeb8690fd61f72e8e02575c05"
      },
      "cell_type": "code",
      "source": "# # 后项搜索\n# def modeling_cross_validation(X=x_resampled.values ,y=y_resampled,featurename=train_x.columns.tolist(),params=None, folds=folds, model_type='lgb',model=None):\n#     scores = []\n#     feature_importance = pd.DataFrame()\n#     for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):\n#         print('Fold', fold_n, 'started at', time.ctime())\n#         X_train, X_valid = X[train_index], X[valid_index]\n#         y_train, y_valid = y[train_index], y[valid_index]\n#         if model_type == 'lgb':\n#             train_data = lgb.Dataset(data=X_train, label=y_train)\n#             valid_data = lgb.Dataset(data=X_valid, label=y_valid)\n#             model = lgb.train(params,train_data,num_boost_round=20000,\n#                     valid_sets = [train_data, valid_data],verbose_eval=2000,early_stopping_rounds = 200)\n#             y_pred_valid = model.predict(X_valid)\n            \n#         if model_type == 'xgb':\n#             train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=featurename)\n#             valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=featurename)\n#             watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]\n#             model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200, verbose_eval=1000, params=params)\n#             y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=featurename), ntree_limit=model.best_ntree_limit)\n            \n#         if model_type == 'rcv':\n#             model = RidgeCV(alphas=(0.01, 0.1, 1.0, 10.0, 100.0), scoring='neg_mean_absolute_error', cv=3)\n#             model.fit(X_train, y_train)\n#             print(model.alpha_)\n\n#             y_pred_valid = model.predict(X_valid).reshape(-1,)\n#             score = mean_absolute_error(y_valid, y_pred_valid)\n        \n#         if model_type == 'sklearn':\n#             model = model\n#             
model.fit(X_train, y_train)\n            \n#             y_pred_valid = model.predict(X_valid).reshape(-1,)\n#             score = mean_absolute_error(y_valid, y_pred_valid)\n        \n#         if model_type == 'cat':\n#             model = CatBoostRegressor(iterations=20000,  eval_metric='auc', **params)\n#             model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)\n#             y_pred_valid = model.predict(X_valid)\n        \n#         fpr, tpr, thresholds = metrics.roc_curve(y_valid, y_pred_valid, pos_label=1)\n#         scores.append(metrics.auc(fpr, tpr))   \n#     return  np.mean(scores)           \n\n# def featureSelect(init_cols):\n#     params = {'num_leaves': 10,\n#          'min_data_in_leaf': 42,\n#          'objective': 'binary',\n#          'max_depth': 18,\n#          'learning_rate': 0.01,\n#          'boosting': 'gbdt',\n#          'bagging_freq': 6,\n#          'bagging_fraction': 0.8,\n#          'feature_fraction': 0.9,\n#          'bagging_seed': 11,\n#          'reg_alpha': 2,\n#          'reg_lambda': 5,\n#          'random_state': 42,\n#          'metric': 'auc',\n#          'verbosity': -1,\n#          'subsample': 0.9,\n#          'min_gain_to_split': 0.01077313523861969,\n#          'min_child_weight': 19.428902804238373,\n#          'num_threads': 4}\n#     best_cols = init_cols.copy()\n#     best_score = modeling_cross_validation(X=x_resampled[best_cols].values, y=y_resampled,featurename=best_cols,params=params)\n    \n#     print(\"初始CV score: {:<8.8f}\".format(best_score))\n#     for f in init_cols:\n\n#         best_cols.remove(f)\n#         score = modeling_cross_validation(X=x_resampled[best_cols].values, y=y_resampled,featurename=best_cols,params=params)\n#         diff = score - best_score \n#         print('-'*10)\n#         if diff > 0.00005:\n#             print(\"当前移除特征: {}, CV score: {:<8.8f}, 最佳cv score: {:<8.8f}, 有效果,删除！！\".format(f,score,best_score))\n#      
       best_score = score\n#         else:\n#             print(\"当前移除特征: {}, CV score: {:<8.8f}, 最佳cv score: {:<8.8f}, 没效果,保留！！\".format(f,score,best_score))\n#             best_cols.append(f)\n#     print('-'*10)\n#     print(\"优化后CV score: {:<8.8f}\".format(best_score))\n    \n#     return best_cols\n\n# best_features = featureSelect(train_x.columns.tolist())\n# print(best_features)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "396837c090766e5b1e5684c4dd149fe4736de6c3"
      },
      "cell_type": "code",
      "source": "def train_model(X=train_x.values ,y=train_y.values,featurename=train_x.columns.tolist(), X_test=test_x, params=None, folds=folds, model_type='lgb', plot_feature_importance=False, model=None):\n    oof = np.zeros(len(X))\n    prediction = np.zeros(len(X_test))\n    scores = []\n    feature_importance = pd.DataFrame()\n    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):\n        print('Fold', fold_n, 'started at', time.ctime())\n        X_train, X_valid = X[train_index], X[valid_index]\n        y_train, y_valid = y[train_index], y[valid_index]\n        if model_type == 'lgb':\n            train_data = lgb.Dataset(data=X_train, label=y_train)\n            valid_data = lgb.Dataset(data=X_valid, label=y_valid)\n            model = lgb.train(params,train_data,num_boost_round=20000,\n                    valid_sets = [train_data, valid_data],verbose_eval=1000,early_stopping_rounds = 200)\n            \n            y_pred_valid = model.predict(X_valid)\n            y_pred = model.predict(X_test, num_iteration=model.best_iteration)\n            \n        if model_type == 'xgb':\n            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=featurename)\n            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=featurename)\n            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]\n            model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200, verbose_eval=1000, params=params)\n            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=featurename), ntree_limit=model.best_ntree_limit)\n            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=featurename), ntree_limit=model.best_ntree_limit)\n            \n        if model_type == 'rcv':\n            model = RidgeCV(alphas=(0.01, 0.1, 1.0, 10.0, 100.0), scoring='neg_mean_absolute_error', cv=3)\n            model.fit(X_train, y_train)\n            
print(model.alpha_)\n\n            y_pred_valid = model.predict(X_valid).reshape(-1,)\n            score = mean_absolute_error(y_valid, y_pred_valid)\n            print(f'Fold {fold_n}. MAE: {score:.4f}.')\n            print('')\n            y_pred = model.predict(X_test).reshape(-1,)\n        \n        if model_type == 'sklearn':\n            model = model\n            model.fit(X_train, y_train)\n            \n            y_pred_valid = model.predict(X_valid).reshape(-1,)\n            score = mean_absolute_error(y_valid, y_pred_valid)\n            print(f'Fold {fold_n}. MAE: {score:.4f}.')\n            print('')\n            \n            y_pred = model.predict(X_test).reshape(-1,)\n        \n        if model_type == 'cat':\n            model = CatBoostRegressor(iterations=20000,  eval_metric='auc', **params)\n            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)\n            y_pred_valid = model.predict(X_valid)\n            y_pred = model.predict(X_test)\n        \n        oof[valid_index] = y_pred_valid.reshape(-1,)\n        fpr, tpr, thresholds = metrics.roc_curve(y_valid, y_pred_valid, pos_label=1)\n        scores.append(metrics.auc(fpr, tpr))\n\n        prediction += y_pred    \n        \n        if model_type == 'lgb':\n            # feature importance\n            fold_importance = pd.DataFrame()\n            fold_importance[\"feature\"] = featurename\n            fold_importance[\"importance\"] = model.feature_importance()\n            fold_importance[\"fold\"] = fold_n + 1\n            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)\n        if model_type == 'xgb':\n            fold_importance =  model.get_fscore() \n            fold_importance = sorted(fold_importance.items(), key=operator.itemgetter(1))  \n            feature_importance = pd.DataFrame(fold_importance, columns=['feature', 'importance'])           \n    prediction /= n_fold\n    print('CV 
mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))\n    \n    if model_type == 'lgb':\n        feature_importance[\"importance\"] /= n_fold\n        if plot_feature_importance:\n            cols = feature_importance[[\"feature\", \"importance\"]].groupby(\"feature\").mean().sort_values(\n                by=\"importance\", ascending=False)[:50].index\n\n            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]\n\n            plt.figure(figsize=(16,26))\n            sns.barplot(x=\"importance\", y=\"feature\", data=best_features.sort_values(by=\"importance\", ascending=False))\n            plt.title('LGB Features (avg over folds)')\n        \n            return oof, prediction, feature_importance\n        return oof, prediction\n    \n    elif model_type == 'xgb':\n        feature_importance['importance'] /= n_fold\n        if plot_feature_importance:\n            plt.figure(figsize=(16,26))  \n            feature_importance.plot(kind='barh', x='feature', y='importance', legend=False, figsize=(6, 10))  \n            plt.title('XGB Features (avg over folds)')  \n            plt.xlabel('relative importance')  \n            plt.show() \n            return oof, prediction, feature_importance\n        return oof, prediction\n    else:\n        return oof, prediction",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "463c8d67a1208c3682cd50bf1489a88351292133"
      },
      "cell_type": "code",
      "source": "params = {'num_leaves': 10,\n         'min_data_in_leaf': 42,\n         'objective': 'binary',\n         'max_depth': 18,\n         'learning_rate': 0.01,\n         'boosting': 'gbdt',\n         'bagging_freq': 6,\n         'bagging_fraction': 0.8,\n         'feature_fraction': 0.9,\n         'bagging_seed': 11,\n         'reg_alpha': 2,\n         'reg_lambda': 5,\n         'random_state': 42,\n         'metric': 'auc',\n         'verbosity': -1,\n         'subsample': 0.9,\n         'min_gain_to_split': 0.01077313523861969,\n         'min_child_weight': 19.428902804238373,\n         'num_threads': 4}\noof_lgb, prediction_lgb, feature_importance_lgb = train_model(params=params, model_type='lgb',plot_feature_importance=True)\n# params = {'eta': 0.05, \n#               'max_depth': 3, \n#               'subsample': 0.9, \n#               'colsample_bytree': 0.9, \n#               'objective': 'binary:logistic', \n#               'eval_metric': 'auc', \n#               'silent': True, \n#               'nthread': 4}\n# oof_xgb, prediction_xgb, feature_importance_xgb = train_model(params=params, model_type='xgb',plot_feature_importance=True)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "fb806b84b66b768894d9d2d9d9e1bdf5ea55075a"
      },
      "cell_type": "code",
      "source": "submission = pd.DataFrame({\"ID_code\": test.ID_code.values})\nsubmission[\"target\"] = prediction_lgb\nsubmission.to_csv(\"submission.csv\", index=False)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "cd7236bad66740a8cbdb0c8c7a106b13a5c8f7ac"
      },
      "cell_type": "code",
      "source": "\n",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "540c2d47a225357c23a33f365ae8cd10af2c599d"
      },
      "cell_type": "code",
      "source": "",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3.6.6",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 1
}